data <- read_rds('data-clean/clean_data.rds') %>%
# Select columns
select(ID, visit_number, visit_day,
hivsn_present, pain)

We estimated the date of SN onset (‘approximate day’) as the mid-point between two successive visits when a participant was found to have transitioned from SN:no to SN:yes. Otherwise, ‘approximate day’ was taken to be the visit_day.
### Create filter of participants who developed SN
### (IDs with at least one SN:yes screening at any visit)
sn_filter <- unique(data$ID[data$hivsn_present == 'yes'])
# SN:yes group
# NOTE(review): lag() is order-dependent -- this assumes rows are sorted by
# visit_number within each ID upstream; confirm in the cleaning script.
approx_sn <- data %>%
# Filter for participants that developed SN
filter(ID %in% sn_filter) %>%
# Group by ID
group_by(ID) %>%
# Approximate days: visit 1 is day 0; an SN:yes visit gets the mid-point
# between it and the preceding visit (approximate onset); SN:no visits
# keep the observed visit_day.
# NOTE(review): the mid-point is applied at *every* SN:yes visit, not only
# the transition visit -- downstream code keeps only the first SN:yes row,
# so this is harmless there, but verify before reusing approx_sn elsewhere.
mutate(approximate_day = ifelse(visit_number == 1,
yes = 0,
no = ifelse(hivsn_present == 'yes',
yes = (visit_day + lag(visit_day)) / 2,
no = visit_day)),
approximate_day = as.integer(round(approximate_day))) %>%
# Add person years (365.25 days per year)
mutate(visit_year = visit_day / 365.25,
approximate_year = approximate_day / 365.25)
# SN:no group
approx_no_sn <- data %>%
    # Keep only participants who never developed SN
    filter(!(ID %in% sn_filter)) %>%
    # Group by participant
    group_by(ID) %>%
    # No onset to interpolate, so the approximate day is simply the
    # (rounded) visit day; add the same person-year conversions as approx_sn
    mutate(approximate_day = as.integer(round(visit_day)),
           visit_year = visit_day / 365.25,
           approximate_year = approximate_day / 365.25)
# Put approx_* data frames together
data <- bind_rows(approx_sn, approx_no_sn)

Extract 3 and 6 month periods for cumulative incidence.
Note: Data were censored at the last screening within each 3/6-month period.
## Generate time filters (period lengths, in days)
## dyears()/ddays() return durations in seconds; their ratio is a day count
months_6 <- round(lubridate::dyears(0.5) / lubridate::ddays(1))
months_3 <- round(lubridate::dyears(0.25) / lubridate::ddays(1))

# Extract SN:yes within 6-month period:
# keep each participant's final screening visit falling within 6 months
data_6months <- data %>%
    filter(visit_day <= months_6) %>%
    group_by(ID) %>%
    filter(visit_day == max(visit_day))
# Extract SN:yes within first 3-month period
data_3months <- data %>%
# Filter values within 3 months (original comment incorrectly said 6 months)
filter(visit_day <= months_3) %>%
# Group by ID
group_by(ID) %>%
# Filter max visit_day (each participant's last visit within the period)
filter(visit_day == max(visit_day))
# Extract SN:yes within second 3-month period
data_3to6months <- data_6months %>%
# Filter out those with SN with first 3 months
filter(!ID %in% data_3months[data_3months$hivsn_present == 'yes', ]$ID)

Only the full 6-month period is needed.
# Extract SN:yes for full period
# (each participant's last visit over the whole study -- used below for the
# incidence-rate calculation)
# NOTE(review): the name df_6months suggests a 6-month restriction, but no
# time filter is applied here -- confirm the intent against data_6months.
df_6months <- data %>%
# Group by ID
group_by(ID) %>%
# Filter max visit_day
filter(visit_day == max(visit_day))

# Create a data frame of SN:yes patients
# One row per participant: the visit at which SN was first detected
data_sn.yes <- data %>%
# Select columns
select(ID,
visit_number,
visit_day,
visit_year,
approximate_day,
approximate_year,
hivsn_present,
pain) %>%
# Add counting columns
## 1 for SN:yes visits, 0 for SN:no visits
mutate(sn_count = ifelse(hivsn_present == 'yes',
yes = 1,
no = 0)) %>%
# Create counting index to identify when SN first develop
## Running total of SN:yes visits within each participant
group_by(ID) %>%
mutate(sn_count = cumsum(sn_count)) %>%
# Filter by sn_count == 1 to get when SN first developed
# NOTE(review): cumsum stays at 1 across any SN:no visits occurring *after*
# the first SN:yes visit, so such rows would also be kept -- safe only if
# SN status never reverts from yes to no; confirm in the data.
filter(sn_count == 1) %>%
# Remove sn_count
select(-sn_count)
# Create data frame of SN:no patients
# (one row per participant: their final visit, since SN never developed)
data_sn.no <- data %>%
    # Keep the same columns as data_sn.yes so the two frames can be joined
    select(ID,
           visit_number,
           visit_day,
           visit_year,
           approximate_day,
           approximate_year,
           hivsn_present,
           pain) %>%
    # Drop anyone who ever developed SN (filter created earlier)
    filter(!(ID %in% sn_filter)) %>%
    # Within each participant, retain only the final visit
    group_by(ID) %>%
    filter(visit_number == max(visit_number))
# Merge the SN:yes and SN:no data frames
# NOTE(review): full_join() is called without 'by =', so it joins on all
# shared columns (dplyr prints a message) -- consider an explicit 'by ='.
data_surv <- data_sn.yes %>%
full_join(data_sn.no) %>%
# New column with recoded hivsn_present data for survival analysis
mutate(hivsn_coded = ifelse(hivsn_present == 'yes',
yes = 1, # SN:yes (event)
no = 0)) # SN:no (censored)
# Add a new column with painful SN (but only if they have SN)
# (NA for participants who never developed SN)
data_surv <- data_surv %>%
mutate(hivsn_painful = ifelse(hivsn_present == 'yes' & pain == 'yes',
yes = 'yes',
no = ifelse(hivsn_present == 'yes' &
pain == 'no',
yes = 'no',
no = NA)))

# Identify and extract information on SN development (at any time)
# by looking at the presence of SN at the final visit
# For each participant, take SN status at their final visit as the outcome
data_model <- data %>%
select(ID, visit_number, hivsn_present) %>%
group_by(ID) %>%
mutate(max_visit = max(visit_number)) %>%
filter(visit_number == max_visit) %>%
select(ID, hivsn_present) %>%
rename(sn = hivsn_present)
# Join the final-visit SN outcome back onto the full cleaned dataset
# NOTE(review): left_join() without 'by =' joins on all shared columns
data_model <- read_rds('data-clean/clean_data.rds') %>%
left_join(data_model) %>%
select(-hivsn_present)
# Restrict data to the baseline visit (visit 1)
data_model %<>%
filter(visit_number == 1)
# Clean-up df_model data frame for analyses
## These data will be used for all the baseline characteristic analyses.
data_model %<>%
# Recode rifafour 'prophylaxis' to 'yes'
mutate(rifafour_treatment = as.character(rifafour_treatment),
rifafour_treatment = ifelse(rifafour_treatment == 'prophylaxis',
yes = 'yes',
no = rifafour_treatment),
rifafour_treatment = factor(rifafour_treatment)) %>%
# Select data that will be modelled
select(sn, age_years, sex, mass_kg, height_m, CD4_cell.ul,
viral_load_copies.ml, alcohol_units.week, TB_current,
rifafour_treatment)

Cumulative incidence measures the number of new cases per person in the population over a defined period of time (i.e., a fixed follow-up period). Therefore, to calculate cumulative incidence we defined a fixed 6-month follow-up period, and also subdivided this period into a 1st and 2nd 3-month period of this follow-up. To standardize the periods of follow-up, data were cleaned such that only visits falling into the indicated periods ((0-3 months], (3-6 months], (0-6 months]) were used to define SN status. As such, participants whose last clinic visit occurred after 91 days (end of the first 3-month interval) or 182 days (end of the 6-month interval), and who were found to have new-onset SN at that visit, were recorded as SN:no over the 3- or 6-month period of follow-up. This conservative strategy may have led to a slight under-estimation of the cumulative incidence of SN.
## Formula for cases / person year
## Bootstrap statistic: SN cases per 1000 patients
## data: data frame with an 'hivsn_present' column ('yes'/'no')
## i: resample row indices supplied by boot()
cases_boot <- function(data, i){
    foo <- data[i, ]
    # Tabulate SN status in the resample
    tab <- table(foo$hivsn_present)
    # Convert to rounded percentages
    prop <- round(prop.table(tab) * 100)
    # Percent * 10 = cases per 1000 patients
    case <- prop * 10
    # FIX(review): index by name rather than position -- 'case[2]' silently
    # returns NA (or the wrong element) if the 'no' level is absent from a
    # bootstrap resample
    case['yes']
}

# Tabulate SN:yes/SN:no
tab_sn <- table(data_6months$hivsn_present)
# Calculate proportions, convert to percent
prop_sn <- round(prop.table(tab_sn) * 100)
# Bootstrap cases per 1000 patients
## Method: BCa, resamples: 10000
cases_ci <- boot.ci(boot.out = boot(data = data_6months,
statistic = cases_boot,
R = 10000,
stype = 'i'),
type = 'bca')
# Create summary table
as.data.frame(tab_sn) %>%
rename(Count = Freq) %>%
left_join(as.data.frame(prop_sn)) %>%
rename(Percentage = Freq,
sn_present = Var1) %>%
bind_cols(`Cases/1000 patients` = c('', paste(cases_ci$t0))) %>%
bind_cols(`Conf.interval` = c('', paste(cases_ci$bca[4],' - ', cases_ci$bca[5])))## sn_present Count Percentage Cases/1000 patients Conf.interval
## 1 no 66 80
## 2 yes 17 20 200 120 - 290
# Tabulate SN:yes/SN:no
tab_sn <- table(data_3months$hivsn_present)
# Calculate proportions, convert to percent
prop_sn <- round(prop.table(tab_sn) * 100)
# Bootstrap cases per 1000 patients
## Method: BCa, resamples: 1000 (original comment incorrectly said 10000)
## NOTE(review): the 6-month analysis above uses R = 10000 -- consider
## matching the number of resamples for consistency.
cases_ci <- boot.ci(boot.out = boot(data = data_3months,
statistic = cases_boot,
R = 1000,
stype = 'i'),
type = 'bca')
# Create summary table
as.data.frame(tab_sn) %>%
rename(Count = Freq) %>%
left_join(as.data.frame(prop_sn)) %>%
rename(Percentage = Freq,
sn_present = Var1) %>%
bind_cols(`Cases/1000 patients` = c('', paste(cases_ci$t0))) %>%
bind_cols(`Conf.interval` = c('', paste0(cases_ci$bca[4],' to ', cases_ci$bca[5])))## sn_present Count Percentage Cases/1000 patients Conf.interval
## 1 no 76 92
## 2 yes 7 8 80 20 to 140
# Tabulate SN:yes/SN:no
tab_sn <- table(data_3to6months$hivsn_present)
# Calculate proportions, convert to percent
prop_sn <- round(prop.table(tab_sn) * 100)
# Bootstrap cases per 1000 patients
## Method: BCa, resamples: 1000 (original comment incorrectly said 10000)
## NOTE(review): the 6-month analysis above uses R = 10000 -- consider
## matching the number of resamples for consistency.
cases_ci <- boot.ci(boot.out = boot(data = data_3to6months,
statistic = cases_boot,
R = 1000,
stype = 'i'),
type = 'bca')
# Create summary table
as.data.frame(tab_sn) %>%
rename(Count = Freq) %>%
left_join(as.data.frame(prop_sn)) %>%
rename(Percentage = Freq,
sn_present = Var1) %>%
bind_cols(`Cases/1000 patients` = c('', paste(cases_ci$t0))) %>%
bind_cols(`Conf.interval` = c('', paste0(cases_ci$bca[4],' to ', cases_ci$bca[5])))## sn_present Count Percentage Cases/1000 patients Conf.interval
## 1 no 66 87
## 2 yes 10 13 130 50 to 210
Incidence rate is a measure of the number of new cases per unit of time. We did not have exact dates of SN onset to define the per-patient unit of time, nor was there uniform spacing between clinic visits. We therefore chose to calculate an approximate SN onset time, arbitrarily defined as the number of days between the first neuropathy screening and the mid-point between the visit at which neuropathy was detected and the preceding visit. For participants who did not develop SN, the date of censoring was defined by the number of days between the first neuropathy screening and the last screening (study exit).
## Bootstrap statistic: cases per person-year
## data: data frame with 'hivsn_present' ('yes'/'no') and 'approximate_year'
## i: resample row indices supplied by boot()
boot_df <- function(data, i){
    resample <- data[i, ]
    # Number of incident SN cases in the resample
    n_cases <- sum(resample$hivsn_present == 'yes')
    # Total follow-up (person-years) accumulated in the resample
    follow_up <- sum(resample$approximate_year)
    n_cases / follow_up
}

# Bootstrap confidence intervals
# BCa bootstrap CI for the incidence rate (cases per person-year)
df_ci <- boot.ci(boot.out = boot(data = df_6months,
                                 statistic = boot_df,
                                 R = 1000,
                                 stype = 'i'),
                 type = 'bca')
# Report the point estimate and 95% CI
# FIX(review): tibble() replaces data_frame(), which is deprecated in the
# tibble package (same output class)
tibble(`Cases/person.year` = round(df_ci$t0, 2),
       Conf.interval = paste0(round(df_ci$bca[4], 2), ' to ',
                              round(df_ci$bca[5], 2)))## # A tibble: 1 x 2
## `Cases/person.year` Conf.interval
## <dbl> <chr>
## 1 0.47 0.3 to 0.7
# Basic Kaplan-Meier (KM) estimate of time to SN onset
## No predictors (~ 1); time = approximate onset/censoring day,
## event = hivsn_coded (1 = SN developed, 0 = censored)
km_basic <- survfit(Surv(time = approximate_day,
event = hivsn_coded) ~ 1,
data = data_surv)
# Summary of KM fit
km_basic## Call: survfit(formula = Surv(time = approximate_day, event = hivsn_coded) ~
## 1, data = data_surv)
##
## n events median 0.95LCL 0.95UCL
## 83 20 NA NA NA
summary(km_basic)## Call: survfit(formula = Surv(time = approximate_day, event = hivsn_coded) ~
## 1, data = data_surv)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 14 83 1 0.988 0.0120 0.965 1.000
## 22 82 1 0.976 0.0168 0.943 1.000
## 23 81 1 0.964 0.0205 0.925 1.000
## 28 80 2 0.940 0.0261 0.890 0.992
## 31 78 1 0.928 0.0284 0.874 0.985
## 46 77 1 0.916 0.0305 0.858 0.977
## 47 76 1 0.904 0.0324 0.842 0.969
## 51 75 1 0.892 0.0341 0.827 0.961
## 52 74 1 0.880 0.0357 0.812 0.952
## 53 73 1 0.867 0.0372 0.798 0.944
## 55 72 1 0.855 0.0386 0.783 0.935
## 56 71 1 0.843 0.0399 0.769 0.925
## 64 70 1 0.831 0.0411 0.755 0.916
## 73 69 1 0.819 0.0422 0.741 0.906
## 84 68 1 0.807 0.0433 0.727 0.897
## 104 67 1 0.795 0.0443 0.713 0.887
## 132 66 1 0.783 0.0452 0.699 0.877
## 143 65 1 0.771 0.0461 0.686 0.867
## 144 64 1 0.759 0.0469 0.672 0.857
# Plot
ggsurvplot(km_basic,
conf.int = TRUE,
fun = 'event',
risk.table = 'abs_pct',
cumcensor = TRUE,
cumevents = TRUE,
ggtheme = theme_bw(),
tables.theme = theme(plot.title = element_text(size = 12,
face = 'bold'),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
axis.title = element_blank(),
axis.text.x = element_blank(),
axis.ticks = element_blank()),
fontsize = 4,
tables.height = 0.1,
legend = 'none',
palette = '#0072B2',
ylim = c(0, 1),
xlab = 'Days',
title = 'Kaplan-Meier Survival Curve: SN vs Days')# Publication plot
## New plot theme
## ggplot2 theme for the publication figure: theme_bw base with enlarged
## black text, a heavier panel border, and only major x gridlines retained.
theme_new <- function(){
theme_bw(base_size = 18) +
theme(axis.text = element_text(size = 22,
colour = '#000000'),
axis.title = element_text(size = 24,
colour = '#000000'),
panel.grid.major.x = element_line(size = 1,
colour = '#CCCCCC'),
panel.border = element_rect(size = 1.5),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank())
}
## Plot
gg_plot <- ggsurvplot(km_basic,
fun = 'event',
conf.int = TRUE,
conf.int.fill = '#666666',
risk.table = 'abs_pct',
cumcensor = TRUE,
cumevents = TRUE,
tables.theme = theme(plot.title = element_text(size = 22,
face = 'bold'),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()),
fontsize = 7,
risk.table.height = 0.08,
cumevents.height = 0.08,
cumcensor.height = 0.08,
xlab = 'Days',
ylab = 'Cumulative proportion\n',
ylim = c(0, 1),
palette = c('#000000'),
legend = 'none',
ggtheme = theme_new())
## Save
ggsave(filename = 'survival-curve.png',
plot = print(gg_plot),
width = 12.7,
height = 12)

In addition, we performed a log-rank test to assess for differences between painful and non-painful SN strata.
# Incidence ~ pain
km_pain <- survfit(Surv(time = approximate_day,
event = hivsn_coded) ~ pain,
data = data_surv)
# Summary
km_pain## Call: survfit(formula = Surv(time = approximate_day, event = hivsn_coded) ~
## pain, data = data_surv)
##
## n events median 0.95LCL 0.95UCL
## pain=no 65 11 NA NA NA
## pain=yes 18 9 104 55 NA
summary(km_pain)## Call: survfit(formula = Surv(time = approximate_day, event = hivsn_coded) ~
## pain, data = data_surv)
##
## pain=no
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 23 65 1 0.985 0.0153 0.955 1.000
## 28 64 1 0.969 0.0214 0.928 1.000
## 31 63 1 0.954 0.0260 0.904 1.000
## 47 62 1 0.938 0.0298 0.882 0.999
## 51 61 1 0.923 0.0331 0.861 0.990
## 53 60 1 0.908 0.0359 0.840 0.981
## 56 59 1 0.892 0.0384 0.820 0.971
## 64 58 1 0.877 0.0407 0.801 0.961
## 132 57 1 0.862 0.0428 0.782 0.950
## 143 56 1 0.846 0.0448 0.763 0.939
## 144 55 1 0.831 0.0465 0.744 0.927
##
## pain=yes
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 14 18 1 0.944 0.0540 0.844 1.000
## 22 17 1 0.889 0.0741 0.755 1.000
## 28 16 1 0.833 0.0878 0.678 1.000
## 46 15 1 0.778 0.0980 0.608 0.996
## 52 14 1 0.722 0.1056 0.542 0.962
## 55 13 1 0.667 0.1111 0.481 0.924
## 73 12 1 0.611 0.1149 0.423 0.883
## 84 11 1 0.556 0.1171 0.368 0.840
## 104 10 1 0.500 0.1179 0.315 0.794
# Log-rank test for painful vs non-painful strata
# FIX(review): test on the same time scale (approximate_day) as the km_pain
# fit above -- the original used visit_day, which is inconsistent with all
# other survival analyses in this file.
# (NB: the cached output below still reflects the original visit_day fit.)
survdiff(Surv(time = approximate_day,
              event = hivsn_coded) ~ pain,
         data = data_surv)## Call:
## survdiff(formula = Surv(time = visit_day, event = hivsn_coded) ~
## pain, data = data_surv)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## pain=no 65 11 16.68 1.93 11.9
## pain=yes 18 9 3.32 9.72 11.9
##
## Chisq= 11.9 on 1 degrees of freedom, p= 6e-04
# Plot
ggsurvplot(km_pain,
conf.int = TRUE,
fun = 'event',
risk.table = 'abs_pct',
cumcensor = TRUE,
cumevents = TRUE,
surv.median.line = 'hv',
pval = TRUE,
ggtheme = theme_bw(),
tables.theme = theme(plot.title = element_text(size = 12,
face = 'bold'),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
axis.title = element_blank(),
axis.text.x = element_blank(),
axis.ticks = element_blank()),
fontsize = 4,
tables.height = 0.14,
legend = 'none',
legend.labs = c('Non-painful SN', 'Painful SN'),
palette = c('#0072B2', '#D55E00'),
ylim = c(0, 1),
xlab = 'Days',
title = 'Kaplan-Meier Survival Curve: SN vs Days')

In an exploratory analysis using Cox proportional hazard models (not shown here), various predictors violated assumptions of the model (e.g., proportional hazards, linearity, no influence points). Therefore, we used logistic regression modelling, with visit 1 characteristics as predictors of SN onset.
Note: Only complete datasets used (n = 77)
The model does not include:
diabetes_hba1c because nobody had HbA1c > 7%.
vitaminB12_deficiency because only one individual had a deficiency.
mass_kg because it is likely to be co-linear with height_m.
consumes_alcohol because this information was deemed to be encoded in alcohol_units.week.
# Double-check presumed 'mass_kg' vs 'height_m' relationship before proceeding
ggplot(data = data_model) +
aes(x = height_m,
y = mass_kg,
colour = sex) +
geom_point() +
geom_smooth(method = 'lm') +
scale_colour_manual(values = c('#000000', '#999999')) +
labs(title = 'Relationship between body mass and height') +
theme_bw()# Males
with(data_model[data_model$sex == 'M', ], cor.test(height_m, mass_kg))##
## Pearson's product-moment correlation
##
## data: height_m and mass_kg
## t = 0.99029, df = 35, p-value = 0.3288
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1679090 0.4642755
## sample estimates:
## cor
## 0.165092
# Females
with(data_model[data_model$sex == 'F', ], cor.test(height_m, mass_kg))##
## Pearson's product-moment correlation
##
## data: height_m and mass_kg
## t = 4.7005, df = 44, p-value = 2.575e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3459427 0.7436566
## sample estimates:
## cor
## 0.5781765
The two variables are related in females, but not in males. We decided to omit mass_kg in favour of height_m because of this relationship, and the stronger historical association between height and SN.
We chose to use LASSO (least absolute shrinkage and selection operator) for variable selection. LASSO is a regression analysis method that performs both variable selection and regularization in order to enhance the prediction accuracy and interpretability of the statistical model it produces.
The process involves performing a 10-fold cross validation to find the optimal lambda (penalization parameter). And then running the analysis and extracting the model based on the best lambda.
# Remove mass_kg
data_model %<>%
select(-mass_kg)
cvafit <- cva.glmnet(sn ~ .,
data = data_model,
family = 'binomial')
# Print CVA object
print(cvafit)## Call:
## cva.glmnet.formula(formula = sn ~ ., data = data_model, family = "binomial")
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Alpha values: 0 0.001 0.008 0.027 0.064 0.125 0.216 0.343 0.512 0.729 1
## Number of crossvalidation folds for lambda: 10
# Plot CVA object log(lambda) vs CV loss (deviance) for each value of alpha
plot(cvafit)# Plot minimum CV loss (deviance) for each value of alpha
minlossplot(cvafit)# Print model at each of the four lowest CV loss alphas
## alpha = 0.216
plot(cvafit$modlist[[7]])
title('alpha = 0.216', line = 2.5)## alpha = 0.343
plot(cvafit$modlist[[8]])
title('alpha = 0.343', line = 2.5)## alpha = 0.512
plot(cvafit$modlist[[9]])
title('alpha = 0.512', line = 2.5)## Alpha = 0.729
plot(cvafit$modlist[[10]])
title('alpha = 0.729', line = 2.5)## and alpha = 1
plot(cvafit$modlist[[11]])
title('alpha = 1.0', line = 2.5)# Extract best lambda for each alpha (lambda.1se)
## alpha = 0.216
a216 <- cvafit$modlist[[7]]$lambda.1se
## alpha = 0.343
a343 <- cvafit$modlist[[8]]$lambda.1se
## alpha = 0.512
a512 <- cvafit$modlist[[9]]$lambda.1se
## Alpha = 0.729
a729 <- cvafit$modlist[[10]]$lambda.1se
## and alpha = 1
a100 <- cvafit$modlist[[11]]$lambda.1se# Fit the models for each alpha
## alpha = 0.216
model_a216 <- glmnet(sn ~ .,
data = data_model,
alpha = 0.216,
family = 'binomial')
## alpha = 0.343
model_a343 <- glmnet(sn ~ .,
data = data_model,
alpha = 0.343,
family = 'binomial')
## alpha = 0.512
model_a512 <- glmnet(sn ~ .,
data = data_model,
alpha = 0.512,
family = 'binomial')
## alpha = 0.729
model_a729 <- glmnet(sn ~ .,
data = data_model,
alpha = 0.729,
family = 'binomial')
## alpha = 1.0
model_a100 <- glmnet(sn ~ .,
data = data_model,
alpha = 1,
family = 'binomial')
# Plot and get coefficients for each model
## alpha = 0.216
plot(model_a216,
xvar = 'lambda',
label = TRUE)
abline(v = log(a216))
title('alpha = 0.216', line = 2.5)coef(model_a216, s = a216)## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.0315285514
## age_years 0.0002568548
## sexF .
## sexM .
## height_m 1.8587897666
## CD4_cell.ul .
## viral_load_copies.ml .
## alcohol_units.week .
## TB_currentno -0.2265441243
## TB_currentyes 0.2266323891
## rifafour_treatmentno .
## rifafour_treatmentyes .
## alpha = 0.343
plot(model_a343,
xvar = 'lambda',
label = TRUE)
abline(v = log(a343))
title('alpha = 0.343', line = 2.5)coef(model_a343, s = a343)## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.6799069
## age_years .
## sexF .
## sexM .
## height_m 2.2608756
## CD4_cell.ul .
## viral_load_copies.ml .
## alcohol_units.week .
## TB_currentno -0.2292115
## TB_currentyes 0.2293869
## rifafour_treatmentno .
## rifafour_treatmentyes .
## alpha = 0.512
plot(model_a512,
xvar = 'lambda',
label = TRUE)
abline(v = log(a512))
title('alpha = 0.512', line = 2.5)coef(model_a512, s = a512)## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.2242726
## age_years .
## sexF .
## sexM .
## height_m 2.5862299
## CD4_cell.ul .
## viral_load_copies.ml .
## alcohol_units.week .
## TB_currentno -0.2108813
## TB_currentyes 0.2108904
## rifafour_treatmentno .
## rifafour_treatmentyes .
## alpha = 0.729
plot(model_a729,
xvar = 'lambda',
label = TRUE)
abline(v = log(a729))
title('alpha = 0.729', line = 2.5)coef(model_a729, s = a729)## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.6289780
## age_years .
## sexF .
## sexM .
## height_m 2.8204092
## CD4_cell.ul .
## viral_load_copies.ml .
## alcohol_units.week .
## TB_currentno -0.1746889
## TB_currentyes 0.1742475
## rifafour_treatmentno .
## rifafour_treatmentyes .
## alpha = 1.0
plot(model_a100,
xvar = 'lambda',
label = TRUE)
abline(v = log(a100))
title('alpha = 1.0', line = 2.5)coef(model_a100, s = a100)## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.754625e+00
## age_years .
## sexF .
## sexM .
## height_m 2.952719e+00
## CD4_cell.ul .
## viral_load_copies.ml .
## alcohol_units.week .
## TB_currentno -2.433997e-01
## TB_currentyes 2.409225e-15
## rifafour_treatmentno .
## rifafour_treatmentyes .
Across all alphas, and best lambdas, the output shows the best model includes height_m and TB_current.
Fit trimmed model using selected variables
mod_final <- glm(sn ~ height_m + TB_current,
data = data_model,
family = binomial(link = 'logit'))
## Generate model summaries
### (Type II SS ANOVA shows significant effect for height_m, TB_current)
### (NOTE: CI of odds ratios for height_m and TB_current are wide)
print(mod_final)##
## Call: glm(formula = sn ~ height_m + TB_current, family = binomial(link = "logit"),
## data = data_model)
##
## Coefficients:
## (Intercept) height_m TB_currentyes
## -19.27 10.72 1.43
##
## Degrees of Freedom: 82 Total (i.e. Null); 80 Residual
## Null Deviance: 91.66
## Residual Deviance: 74 AIC: 80
car::Anova(mod_final)## Analysis of Deviance Table (Type II tests)
##
## Response: sn
## LR Chisq Df Pr(>Chisq)
## height_m 7.7696 1 0.005313 **
## TB_current 5.0998 1 0.023929 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod_final)##
## Call:
## glm(formula = sn ~ height_m + TB_current, family = binomial(link = "logit"),
## data = data_model)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3935 -0.6913 -0.4096 -0.2304 2.4169
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -19.2742 7.0573 -2.731 0.00631 **
## height_m 10.7248 4.2267 2.537 0.01117 *
## TB_currentyes 1.4297 0.6357 2.249 0.02452 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.663 on 82 degrees of freedom
## Residual deviance: 73.996 on 80 degrees of freedom
## AIC: 79.996
##
## Number of Fisher Scoring iterations: 5
OR(mod_final, what = 'all')[1:3, c(1, 2, 6:8)] ## model logit OR lowOR upOR
## 1: (Intercept) -19.274156 4.259322e-09 1.304270e-11 1.390956e-06
## 2: height_m 10.724798 4.546955e+04 1.392348e+02 1.484887e+07
## 3: TB_currentyes 1.429731 4.177577e+00 1.279239e-02 1.364261e+03
# Diagnostic measures for a binary regression model by covariate pattern
data_dx <- dx(mod_final, byCov = TRUE)
# Outliers and leverage
## Plot 'dChisq' for each probability (P).
### 'DChisq' gives the change in the Pearson chi-square statistic with
### observation i removed.
### Values should be < 4 if the observation has little influence on the model.
(gg_out <- ggplot(data = data_dx) +
aes(x = P,
y = dChisq) +
geom_point(shape = 21,
size = 2,
fill = '#FFFFFF') +
geom_hline(yintercept = 4,
linetype =2))### Label dodgy point with height and TB info
gg_out +
geom_label(label = data_dx %>%
filter(dChisq == max(dChisq)) %>%
mutate(new = str_glue('height_m: {height_m}, TB(yes): {TB_currentyes}')) %>%
.$new,
x = data_dx %>%
filter(dChisq == max(dChisq)) %>%
.$P,
y = data_dx %>%
filter(dChisq == max(dChisq)) %>%
.$dChisq,
hjust = -0.1)### Influence plot
influenceIndexPlot(mod_final,
id.n = 3,
id.col = 'red')influencePlot(mod_final,
id.col = 'red')## StudRes Hat CookD
## 6 2.5024065 0.02341176 0.14362726
## 45 -1.0571673 0.08129135 0.02233145
## 47 2.5024065 0.02341176 0.14362726
## 61 -0.6759985 0.11187100 0.01125179
### Remove influence points one at a time
### NOTE(review): influencePlot() above flags observations 6, 45, 47 and 61,
### yet the refits below drop 6, 46 and 58 -- confirm whether these indices
### refer to row numbers or row names, and reconcile with the flagged points.
mod_influence6 <- update(mod_final,
subset = c(-6))
mod_influence46 <- update(mod_final,
subset = c(-46))
mod_influence58 <- update(mod_final,
subset = c(-58))
### Compare coefficients with and without influence points
compareCoefs(mod_final, mod_influence6)## Calls:
## 1: glm(formula = sn ~ height_m + TB_current, family = binomial(link =
## "logit"), data = data_model)
## 2: glm(formula = sn ~ height_m + TB_current, family = binomial(link =
## "logit"), data = data_model, subset = c(-6))
##
## Model 1 Model 2
## (Intercept) -19.27 -24.07
## SE 7.06 8.11
##
## height_m 10.72 13.52
## SE 4.23 4.82
##
## TB_currentyes 1.430 1.513
## SE 0.636 0.652
##
compareCoefs(mod_final, mod_influence46)## Calls:
## 1: glm(formula = sn ~ height_m + TB_current, family = binomial(link =
## "logit"), data = data_model)
## 2: glm(formula = sn ~ height_m + TB_current, family = binomial(link =
## "logit"), data = data_model, subset = c(-46))
##
## Model 1 Model 2
## (Intercept) -19.27 -19.05
## SE 7.06 7.03
##
## height_m 10.72 10.60
## SE 4.23 4.21
##
## TB_currentyes 1.430 1.418
## SE 0.636 0.636
##
compareCoefs(mod_final, mod_influence58)## Calls:
## 1: glm(formula = sn ~ height_m + TB_current, family = binomial(link =
## "logit"), data = data_model)
## 2: glm(formula = sn ~ height_m + TB_current, family = binomial(link =
## "logit"), data = data_model, subset = c(-58))
##
## Model 1 Model 2
## (Intercept) -19.27 -19.07
## SE 7.06 7.09
##
## height_m 10.72 10.60
## SE 4.23 4.25
##
## TB_currentyes 1.430 1.427
## SE 0.636 0.635
##
## Check '6' and '46' for anything strange
data_model %>%
select(sn, height_m, TB_current) %>%
.[6, ]## # A tibble: 1 x 3
## sn height_m TB_current
## <fct> <dbl> <fct>
## 1 yes 1.53 no
data_model %>%
select(sn, height_m, TB_current) %>%
.[46, ]## # A tibble: 1 x 3
## sn height_m TB_current
## <fct> <dbl> <fct>
## 1 no 1.61 no
# Test for multicolinearity using variance inflation factor (vif)
## Rule of thumb: values > 4 are a problem
vif(mod_final) ## height_m TB_current
## 1.0178 1.0178
# Test for correlated residuals
## p-vale > 0.05: no correlation between residuals
durbinWatsonTest(mod_final)## lag Autocorrelation D-W Statistic p-value
## 1 -0.056383 2.105132 0.626
## Alternative hypothesis: rho != 0
# Plot 'dBhat' for each probability (P).
## 'dBhat' is the change in beta coefficient with observation i excluded.
## 'dBhat' should be < 1 if the i has little influence on the coefficients.
ggplot(data = data_dx) +
aes(x = P,
y = dBhat) +
geom_point(shape = 21,
size = 2,
fill = '#FFFFFF') +
geom_hline(yintercept = 1,
linetype =2)# Check goodness of fit (Hosmer-Lemeshow goodness of fit test)
## Significant p-values indicate a poor fit
## ROC 0.7 < auc < 0.8 indicates acceptable discrimination
gof(mod_final, g = 8)## chiSq df pVal
## PrI 90.992 80 0.18827
## drI 73.996 80 0.66785
## PrG 28.008 36 0.82691
## drG 29.809 36 0.75690
## PrCT 28.008 36 0.82691
## drCT 29.809 36 0.75690
## val df pVal
## HL chiSq 4.66186 6 0.58785
## mHL F 0.63294 7 0.72505
## OsRo Z NA NA 0.33399
## SstPgeq0.5 Z 1.21921 NA 0.22276
## SstPl0.5 Z 1.54241 NA 0.12297
## SstBoth chiSq 3.86551 2 0.14475
## SllPgeq0.5 chiSq 2.64901 1 0.10361
## SllPl0.5 chiSq 2.35301 1 0.12504
## SllBoth chiSq 3.08826 2 0.21350
sessionInfo()## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 LogisticDx_0.2 survminer_0.4.3 ggpubr_0.2
## [5] survival_2.43-1 car_3.0-2 carData_3.0-2 boot_1.3-20
## [9] glmnetUtils_1.1 forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
## [13] purrr_0.2.5 readr_1.1.1 tidyr_0.8.2 tibble_1.4.2
## [17] ggplot2_3.1.0 tidyverse_1.2.1 magrittr_1.5
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-9 colorspace_1.3-2 rio_0.5.10
## [4] rprojroot_1.3-2 htmlTable_1.12 base64enc_0.1-3
## [7] rstudioapi_0.8 MatrixModels_0.4-1 fansi_0.4.0
## [10] mvtnorm_1.0-8 lubridate_1.7.4 xml2_1.2.0
## [13] codetools_0.2-15 splines_3.5.1 knitr_1.20
## [16] Formula_1.2-3 jsonlite_1.5 speedglm_0.3-2
## [19] pROC_1.13.0 broom_0.5.0 km.ci_0.5-2
## [22] cluster_2.0.7-1 aod_1.3 compiler_3.5.1
## [25] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
## [28] Matrix_1.2-15 lazyeval_0.2.1 cli_1.0.1
## [31] acepack_1.4.1 htmltools_0.3.6 quantreg_5.36
## [34] tools_3.5.1 gtable_0.2.0 glue_1.3.0
## [37] Rcpp_1.0.0 cellranger_1.1.0 nlme_3.1-137
## [40] iterators_1.0.10 openxlsx_4.1.0 rvest_0.3.2
## [43] statmod_1.4.30 polspline_1.1.13 MASS_7.3-51.1
## [46] zoo_1.8-4 scales_1.0.0 hms_0.4.2
## [49] parallel_3.5.1 sandwich_2.5-0 SparseM_1.77
## [52] RColorBrewer_1.1-2 yaml_2.2.0 curl_3.2
## [55] gridExtra_2.3 KMsurv_0.1-5 rms_5.1-2
## [58] rpart_4.1-13 latticeExtra_0.6-28 stringi_1.2.4
## [61] foreach_1.4.4 checkmate_1.8.5 zip_1.0.0
## [64] rlang_0.3.0.1 pkgconfig_2.0.2 evaluate_0.12
## [67] lattice_0.20-38 bindr_0.1.1 labeling_0.3
## [70] htmlwidgets_1.3 cmprsk_2.2-7 tidyselect_0.2.5
## [73] plyr_1.8.4 R6_2.3.0 Hmisc_4.1-1
## [76] multcomp_1.4-8 pillar_1.3.0 haven_1.1.2
## [79] foreign_0.8-71 withr_2.1.2.9000 abind_1.4-5
## [82] nnet_7.3-12 modelr_0.1.2 crayon_1.3.4
## [85] survMisc_0.5.5 utf8_1.1.4 rmarkdown_1.10
## [88] grid_3.5.1 readxl_1.1.0 data.table_1.11.8
## [91] digest_0.6.18 xtable_1.8-3 munsell_0.5.0
## [94] glmnet_2.0-16